#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <utility>
#include <string>
#include <vector>
#include <tuple>
#include <queue>

bool is_whitespace(char ch) noexcept {
    // " \t\n\r\x0b\x0c" (python string.whitespace)
    return ch == 32 or (9 <= ch and ch <= 13);
//    return ch <= 32; // arguably too general, but slightly faster
}

bool is_punctuation(char c) noexcept {
    // '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'      ascii values:    33-47,  58-64,  91-96,  123-126
    return (33 <= c and c <= 47) or (58 <= c and c <= 64) or (91 <= c and c <= 96) or (123 <= c and c <= 126);
}

// Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters
// Returns a LARGE array of ngrams
std::vector<std::string> clean_ngram(
    std::string const & input, std::string const & ignore, size_t ngram_n
) noexcept {

    size_t num_grams = 0;
    std::vector<std::string> ngram_list;
    std::vector<uint8_t> gram_lengths;
    std::string current_ngram;

    // Max gram length is set to 10 below.
    current_ngram.reserve(11*ngram_n);
    gram_lengths.reserve(ngram_n);

    bool started_gram = false;
    gram_lengths.push_back(0);

    //for (size_t i=0; i<input.length(); i++) {
    // this is slightly faster, and we don't need the index in this one
    for (auto iter = input.begin(); iter != input.end(); iter++) {

        // If whitespace, end the current ngram and start the next
        // alternatively, (perhaps marginally) faster: if (is_whitespace(ch)) { ... }
        if (is_whitespace(*iter) || gram_lengths.back() > 10) {

            // Skip all whitespace
            while (++iter != input.end() && is_whitespace(*iter));
            iter--;

            if (started_gram){
                num_grams += 1;

                // Building 1grams is a special case
                if (ngram_n == 1){
                    ngram_list.push_back(current_ngram);
                    current_ngram = current_ngram.substr(gram_lengths.front());
                    gram_lengths.back() = 0;

                // If there are enough grams to form an ngram, save
                } else if (num_grams >= ngram_n){
                    // Save the current ngram
                    ngram_list.push_back(current_ngram);

                    // Start the next ngram by dropping the first gram and its space from the ngram
                    current_ngram = current_ngram.substr(gram_lengths.front() + 1);
                    current_ngram += ' ';

                    // Drop the length of the first gram and prepare to record the length of the new gram
                    gram_lengths.erase(gram_lengths.begin());
                    gram_lengths.push_back(0);

                // Otherwise, continute building
                } else {
                    current_ngram += ' ';
                    gram_lengths.push_back(0);
                }

                started_gram = false;
            }


        // Skip ignored characters
        // alternatively, (perhaps marginally) faster: if (is_punctuation(ch)) continue;
        } else if (ignore.find(*iter) != std::string::npos) {
            continue;
        }

        // If it is a non-ignored character, add it to the ngram and update the last gram's length
        else {
            current_ngram += tolower(*iter);
            gram_lengths.back() += 1;
            started_gram = true;
        }
    }

    return ngram_list;
}


// Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters
// Returns a LARGE array of tuples of (ngram, start_idx, end_idx)
std::vector<std::tuple<std::string, size_t, size_t> > clean_ngram_with_indices(
    std::string const & input, std::string const & ignore, size_t ngram_n
) noexcept {

    size_t num_grams = 0;
    std::vector<std::tuple<std::string, size_t, size_t> > ngram_list;
    std::vector<uint8_t> gram_lengths;
    std::vector<size_t> gram_start_indices;
    std::string current_ngram;

    // Max gram length is set to 10 below.
    current_ngram.reserve(11*ngram_n);

    bool started_gram = false;
    gram_lengths.push_back(0);
    gram_start_indices.push_back(0);

    for (size_t i=0; i<input.length(); i++) {
        char ch = input[i];

        // If whitespace, end the current ngram and start the next
        if (is_whitespace(ch) || gram_lengths.back() > 10) {

            // Skip all whitespace
            while (++i < input.length() && is_whitespace(input[i]));
            i--;

            if (started_gram){
                num_grams += 1;

                // Building 1grams is a special case
                if (ngram_n == 1){
                    ngram_list.push_back(std::make_tuple(current_ngram, gram_start_indices.front(), i));
                    current_ngram = current_ngram.substr(gram_lengths.front());
                    gram_lengths.back() = 0;
                    gram_start_indices.back() = i+1;

                // If there are enough grams to form an ngram, save
                } else if (num_grams >= ngram_n){

                    // Save the current ngram
                    ngram_list.push_back(
                        std::make_tuple(current_ngram, gram_start_indices.front(), i)
                    );

                    // Start the next ngram by dropping the first gram and its space from the ngram
                    current_ngram = current_ngram.substr(gram_lengths.front() + 1);
                    current_ngram += ' ';

                    // Drop the length of the first gram and prepare to record the length of the new gram
                    gram_lengths.erase(gram_lengths.begin());
                    gram_lengths.push_back(0);

                    gram_start_indices.erase(gram_start_indices.begin());
                    gram_start_indices.push_back(i+1);

                // Otherwise, continute building
                } else {
                    current_ngram += ' ';
                    gram_lengths.push_back(0);
                    gram_start_indices.push_back(i+1);
                }

                started_gram = false;
            }

        // Skip ignored characters
        } else if (ignore.find(*iter) != std::string::npos) {
            continue;

        // If it is a non-ignored character, add it to the ngram and update the last gram's length
        } else {
            current_ngram += tolower(ch);
            gram_lengths.back() += 1;
            started_gram = true;
        }
    }

    return ngram_list;
}


PYBIND11_MODULE(janitor_util, m) {
    m.doc() = "pybind11 example plugin"; // optional module docstring
//    m.def("add", &add, "A function which adds two numbers");  // example function
    m.def("clean_ngram", &clean_ngram, "Create ngrams of words, ignoring some characters");
    m.def("clean_ngram_with_indices", &clean_ngram_with_indices, "Create ngrams of words with indices, ignoring some characters");
}

// Example compile
// c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
// If python and gcc aren't linked, append to the above:    -undefined dynamic_lookup